# download_jecs_ojs_universal_no_sections.py
# JECS/OJS (Journal of Education Culture and Society) Universal Downloader
# Universal OJS downloader developed for JECS and similar journals
# - Parses OJS issue TOC and article pages
# - Handles /view/ to /download/ galley link conversion
# - NOTE: no section filtering is applied; Editorials and Book Reviews listed in the TOC are downloaded like any other article
# - Uses only article titles for filenames (no section prefixes)

"""

Supported:
- OJS 2.x and 3.x platforms
- Issues with direct PDF links in Table of Contents
- Issues requiring article-page PDF extraction

Usage:
1. Run the script.
2. Enter the full issue URL when prompted (e.g., https://www.jecs.pl/index.php/jecs/issue/view/39).
3. All PDFs will be downloaded into a folder named after the issue title.
4. Filenames will include only sanitized article titles (no section names).

Notes:
- Automatically handles /view/ to /download/ conversion for OJS galleys.
- Uses a fallback method to open article pages if direct PDF link is missing.
"""


import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {"User-Agent": "Mozilla/5.0"}

def sanitize(text):
    """Return *text* made safe for use as a file or folder name.

    Control characters (including newlines and tabs) are turned into spaces,
    the characters Windows reserves in file names (``\\ / * ? : " < > |``) are
    removed, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is stripped.

    The result may be an empty string if *text* contains nothing usable;
    callers that need a non-empty name must supply their own fallback.
    """
    # Titles scraped from HTML can contain line breaks/tabs, which are not
    # valid in Windows file names and are awkward everywhere else.
    text = re.sub(r"[\x00-\x1f\x7f]", " ", text)
    text = re.sub(r'[\\/*?:"<>|]', "", text)
    # Collapse after removal so "a / b" does not leave a double space behind.
    return re.sub(r"\s+", " ", text).strip()

# --- Ask for issue URL ---
issue_url = input("Enter the OJS issue URL: ").strip()

# --- Fetch page ---
# A timeout stops a stalled server from hanging the script forever, and
# raise_for_status() keeps an HTTP error page (404/500) from being parsed as
# if it were the issue's table of contents.
try:
    resp = requests.get(issue_url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"❌ Could not fetch issue page: {e}")
soup = BeautifulSoup(resp.text, "html.parser")

# --- Base URL ---
# scheme://host of the issue page.
parsed = urlparse(issue_url)
base_url = f"{parsed.scheme}://{parsed.netloc}"

# --- Folder based on issue title ---
# The first <h1> on the page is used as the issue title. If it is missing, or
# nothing is left of it after sanitizing, fall back to a fixed folder name
# (os.makedirs("") would raise).
issue_title_tag = soup.find("h1")
folder = ""
if issue_title_tag:
    folder = sanitize(issue_title_tag.get_text(strip=True))[:200].strip()
if not folder:
    folder = "OJS_Issue"
os.makedirs(folder, exist_ok=True)

# Number of PDFs successfully saved so far.
count = 0

# --- Iterate through sections and articles ---
# Seconds to wait on any single HTTP request; without a timeout a stalled
# server would hang the script indefinitely.
REQUEST_TIMEOUT = 60

# Lower-cased filenames already written during this run, so two articles whose
# sanitized/truncated titles coincide do not overwrite each other.
saved_names = set()

for art in soup.select("div.obj_article_summary"):
    title_tag = art.find("h3", class_="title")
    if not title_tag:
        continue
    # Placeholder keeps an empty title from producing a bare ".pdf" file.
    title = sanitize(title_tag.get_text(" ", strip=True))[:200].strip() or "untitled"

    # Check for direct PDF link in TOC
    pdf_link_tag = art.select_one("a.obj_galley_link.pdf")
    if pdf_link_tag and pdf_link_tag.get("href"):
        # OJS galley "/view/" URLs normally return an HTML viewer page; the
        # matching "/download/" URL returns the file itself. Links are resolved
        # against the page URL so relative hrefs work as well as absolute ones.
        pdf_url = urljoin(issue_url, pdf_link_tag["href"].replace("/view/", "/download/"))
    else:
        # Fallback: open article page to find PDF
        title_link = title_tag.find("a")
        if not title_link or not title_link.get("href"):
            print(f"⚠️ No PDF for: {title}")
            continue
        art_url = urljoin(issue_url, title_link["href"])
        try:
            art_resp = requests.get(art_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            art_resp.raise_for_status()
        except requests.RequestException as e:
            # One unreachable article page must not abort the whole issue.
            print(f"❌ Error downloading {title}: {e}")
            continue
        art_soup = BeautifulSoup(art_resp.text, "html.parser")
        # Prefer the galley link class; otherwise accept any link whose visible
        # text mentions "PDF" (get_text() also sees text nested in child tags).
        pdf_link = art_soup.select_one("a.obj_galley_link.pdf") or art_soup.find(
            lambda tag: tag.name == "a" and tag.get("href") and "PDF" in tag.get_text()
        )
        if not pdf_link or not pdf_link.get("href"):
            print(f"⚠️ No PDF for: {title}")
            continue
        # Same "/view/" -> "/download/" conversion as for TOC links.
        pdf_url = urljoin(art_url, pdf_link["href"].replace("/view/", "/download/"))

    # Pick a filename not yet used in this run ("Title.pdf", "Title (2).pdf", ...).
    filename = f"{title}.pdf"
    suffix = 2
    while filename.lower() in saved_names:
        filename = f"{title} ({suffix}).pdf"
        suffix += 1
    path = os.path.join(folder, filename)

    print(f"[{count+1}]⬇️ Downloading: {filename}")
    try:
        pdf = requests.get(pdf_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        pdf.raise_for_status()
        content_type = pdf.headers.get("Content-Type", "").lower()
        # Some servers send PDFs with a generic content type, so also accept
        # a body that starts with the PDF magic bytes.
        if "application/pdf" not in content_type and not pdf.content.startswith(b"%PDF"):
            print(f"❌ Skipped (not PDF): {title}")
            continue
        with open(path, "wb") as f:
            f.write(pdf.content)
    except (requests.RequestException, OSError) as e:
        print(f"❌ Error downloading {title}: {e}")
        continue

    saved_names.add(filename.lower())
    count += 1
    print(f"✅ Saved: {filename}")

print(f"\n🎉 Done! {count} PDFs saved in {folder}")
